# All libraries are present here:
knitr::opts_chunk$set(echo = TRUE)
library(readxl)
library(ggplot2)
library(ggExtra)
## Warning: package 'ggExtra' was built under R version 4.0.5
library(moments)
library(Hotelling)
## Loading required package: corpcor
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x dplyr::summarise() masks Hotelling::summarise()
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Reading all the spreadsheets
bankruptcy = read_xlsx("bankruptcy.xlsx")
census = read_xlsx("census.xlsx")
skull = read_xlsx("skulldata.xlsx")
wordpar = read_xlsx("wordparity.xlsx")

1)

1a)

Getting a sample

# Skull data work
samplevals = data.frame()
for(i in c(1:3)) {
  setframe = skull[skull$Time == i, ]
  setvals = sample.int(length(setframe$Time), 15)
  sampleframe = setframe[setvals, ]
  samplevals = rbind(samplevals, sampleframe)
}
samplevals
## # A tibble: 45 × 5
##    MaxBr BasHt BasLength NasHt  Time
##    <dbl> <dbl>     <dbl> <dbl> <dbl>
##  1   126   133       102    51     1
##  2   128   132        93    53     1
##  3   136   143       100    54     1
##  4   125   136        93    48     1
##  5   132   136       100    50     1
##  6   126   129       109    51     1
##  7   138   135       100    55     1
##  8   124   138       101    46     1
##  9   141   140       100    51     1
## 10   130   130       104    49     1
## # … with 35 more rows

Finding the overall mean of the sample

overallmean = colMeans(samplevals[1:4])
overallmean
##     MaxBr     BasHt BasLength     NasHt 
## 132.86667 133.73333  97.51111  50.64444

Level Means

l1means = colMeans(samplevals[samplevals$Time == 1,1:4])
l2means = colMeans(samplevals[samplevals$Time == 2,1:4])
l3means = colMeans(samplevals[samplevals$Time == 3,1:4])

l1means
##     MaxBr     BasHt BasLength     NasHt 
## 130.93333 134.93333  99.26667  50.86667
l2means
##     MaxBr     BasHt BasLength     NasHt 
## 133.53333 133.46667  98.66667  50.20000
l3means
##     MaxBr     BasHt BasLength     NasHt 
## 134.13333 132.80000  94.60000  50.86667

Treatment sum of squares

treatment = (15 * (sum((overallmean - l1means)^2)))+(15 *(sum((overallmean - l2means) ^2)) )+(15 * (sum((overallmean - l3means)^2)))
treatment
## [1] 320.3556

Residual sum of squares

resid = ((sum((samplevals[samplevals$Time == 1,1:4] - l1means)^2))) + ((sum((samplevals[samplevals$Time == 2,1:4] - l2means) ^2))) + ((sum((samplevals[samplevals$Time == 3,1:4] - l3means)^2)))
resid
## [1] 406845.5

Total Sum of squares

total = ((sum((samplevals[samplevals$Time == 1,1:4] - overallmean)^2))) + ((sum((samplevals[samplevals$Time == 2,1:4] - overallmean) ^2))) + ((sum((samplevals[samplevals$Time == 3,1:4] - overallmean)^2)))
total
## [1] 406601.4

Manova Table:

titles = c("Treatment, B", "Residual, W", "Total")
SS = c(treatment, resid, total)
df = c(3, 36, 42)
data.frame(titles, SS, df)
##         titles          SS df
## 1 Treatment, B    320.3556  3
## 2  Residual, W 406845.4667 36
## 3        Total 406601.3778 42

Wilks lambda

wlambda = (resid) / (resid + treatment)
wlambda
## [1] 0.9992132

1b)

2)

2a)

setvals2 = sample.int(length(wordpar$worddiff), 20)
samplevals2 = wordpar[setvals2, ]
samplevals2
## # A tibble: 20 × 4
##    worddiff wordsame Arabicdiff Arabicsame
##       <dbl>    <dbl>      <dbl>      <dbl>
##  1     751      744        683        553 
##  2     731      662        662.       624 
##  3    1172.     896.       926        766 
##  4     774      735        671        612 
##  5    1096.    1009       1076        983 
##  6    1044      909        865        839 
##  7     814.     750.       711        625 
##  8    1225     1179       1037        906.
##  9     995      875        678        659 
## 10     976.     872.       814        735 
## 11     656.     660.       572        539 
## 12     869      860.       691        601 
## 13     708      669        657        572.
## 14     767      738.       724        639 
## 15     726      674        663        583 
## 16     982      894        831        640 
## 17    1056      930.       833        826 
## 18    1126      954        888        728 
## 19     747      752.       777        688.
## 20    1408.    1311        854        986
samplevals2$word = samplevals2$worddiff - samplevals2$wordsame
samplevals2$Arabic = samplevals2$Arabicsame -  samplevals2$Arabicdiff
dbardiffvsame = c(((1/20) * sum(samplevals2$word)), ((1/20)*(sum(samplevals2$Arabic))))
dbardiffvsame
## [1]  77.45 -75.50
varianceword = sum((samplevals2$word - dbardiffvsame[1])^2) * (1 / (20 - 1))
covwordarab = sum((samplevals2$word - dbardiffvsame[1]) * (samplevals2$Arabic - dbardiffvsame[2])) * (1 / (20 -1))
variancearab = sum((samplevals2$Arabic - dbardiffvsame[2])^2) * (1 / (20 - 1))
varianceword
## [1] 4594.918
covwordarab
## [1] -478.0395
variancearab
## [1] 4820.632

Doing the rest of the calculation by hand, we have: (T^2) = n * (Dbar’) * (inv(S)) * (Dbar) = 40.2

F-value is:

(((19)*(2))/(18))*(qf(0.95, 2, 18))
## [1] 7.504065

By this, we know that the reactions for same vs different numbers are different, on average.

2b)

samplevals2$diff = samplevals2$worddiff - samplevals2$Arabicdiff
samplevals2$same = samplevals2$wordsame - samplevals2$Arabicsame

dbarwordarab = c(((1/20) * sum(samplevals2$diff)), ((1/20)*(sum(samplevals2$same))))
dbarwordarab
## [1] 150.525 148.575
variancediff = sum((samplevals2$diff - dbarwordarab[1])^2) * (1 / (20 - 1))
covdiffsame = sum((samplevals2$diff - dbarwordarab[1]) * (samplevals2$same - dbarwordarab[2])) * (1 / (20 -1))
variancesame = sum((samplevals2$same - dbarwordarab[2])^2) * (1 / (20 - 1))
variancediff
## [1] 16694.85
covdiffsame
## [1] 7691.445
variancesame
## [1] 7147.507

Doing the rest of the calculation by hand, we have: (T^2) = n * (Dbar’) * (inv(S)) * (Dbar) = 43.2

By this, we know that the reactions for written vs arabic numbers are also different, on average.

3)

setvals3 = sample.int(length(census$totpop), 40)
samplevals3 = census[setvals3, ]
samplevals3
## # A tibble: 40 × 5
##    totpop  prof   emp   gov homeval
##     <dbl> <dbl> <dbl> <dbl>   <dbl>
##  1   6.03  3.1   76.0  25      1.08
##  2   1.64 16.7   64.6  49.4    3.13
##  3   3.38  2.17  66.1  22.6    1.44
##  4   4.36  1.67  65.4  23.7    1.07
##  5   5.54  9.25  74.9  31      2.23
##  6   5.44  2.93  73.6  22.3    1.65
##  7   5.48  1.34  77.4  21.6    1.32
##  8   3.58  3.38  65.6  26.1    1.31
##  9   5.34  3.41  72.6  20.1    1.66
## 10   6.29  2.6   64.3  27.4    1.78
## # … with 30 more rows

first the covariance matrix:

covmatcensus = cov(samplevals3)
covmatcensus
##              totpop      prof         emp        gov     homeval
## totpop   3.19938045 -1.275331   1.7773157  -2.691683 -0.02120096
## prof    -1.27533109 10.990055  -2.1098702  11.838561  1.49675673
## emp      1.77731571 -2.109870  51.8161892 -41.694099 -0.29842186
## gov     -2.69168269 11.838561 -41.6940994 109.634455  1.86011859
## homeval -0.02120096  1.496757  -0.2984219   1.860119  0.34017122

and now the eigenvalues + vectors:

decmp = eigen(covmatcensus)
decmp
## eigen() decomposition
## $values
## [1] 132.656873  30.638647   9.592309   2.967553   0.124869
## 
## $vectors
##             [,1]         [,2]       [,3]         [,4]          [,5]
## [1,]  0.02558977 -0.006026141  0.1524519  0.986522961 -0.0532891113
## [2,] -0.09433218 -0.171562548 -0.9602729  0.142290487 -0.1389143399
## [3,]  0.45856771 -0.881475104  0.1075368 -0.033945944 -0.0008957122
## [4,] -0.88314857 -0.439108416  0.1648970 -0.005441539 -0.0034302049
## [5,] -0.01452079 -0.026747284 -0.1260131  0.073102171  0.9888632404

Based on the eigenvalues present, I would want to have 3 principle components present.

a1x = c(0,-0.955, 0.0241, -0.157, 0.0442, -0.0539)
y1 = sum(a1x^2)
y1
## [1] 0.9421137
a2x = c(0, 0.299, 0.941, 0.0554, -0.185, -0.155)
y2 = sum(a2x^2)
y2
## [1] 1.036201
a3x = c(0,-0.0784, 0.0639, 0.791, 0.586, -0.144)
y3 = sum(a3x^2)
y3
## [1] 1.000043

3b)

3c)

I can’t see anything in the slides pertaining to a confidence region for this question. In this case, I will be skipping it and moving ahead to other questions.

4)

4a)

Calculating by hand, I got: L = [32.5 51.6 31.1 134 0.639]

L * L’ = [1060 1680 1010 4370 20.8 1680 2660 1060 6940 33 1010 1600 966 4180 19.9 4370 6940 4180 18100 85.9 20.8 33 19.9 85.9 0.408]

4b)

4c)

5)

5a)

bankruptcy
## # A tibble: 46 × 5
##     cftd  nita  cacl  cans   pop
##    <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 -0.45 -0.41  1.09  0.45     0
##  2 -0.56 -0.31  1.51  0.16     0
##  3  0.06  0.02  1.01  0.4      0
##  4 -0.07 -0.09  1.45  0.26     0
##  5 -0.1  -0.09  1.56  0.67     0
##  6 -0.14 -0.07  0.71  0.28     0
##  7  0.04  0.01  1.5   0.71     0
##  8 -0.07 -0.06  1.37  0.4      0
##  9  0.07 -0.01  1.37  0.34     0
## 10 -0.14 -0.14  1.42  0.43     0
## # … with 36 more rows
avai_id = which(!is.na(bankruptcy$nita))
bankruptcy1 = bankruptcy[avai_id, ]
pop1vals = sample.int(length(bankruptcy$pop[bankruptcy1$pop == 0]), 11)
pop2vals = sample.int(length(bankruptcy$pop[bankruptcy1$pop == 1]), 15)
pop1 = bankruptcy1[bankruptcy1$pop == 0, ]
pop2 = bankruptcy1[bankruptcy1$pop == 1, ]
samplepop1 = pop1[pop1vals, ]
samplepop2 = pop2[pop2vals, ]
samplepop = rbind(samplepop1, samplepop2)
samplepop1
## # A tibble: 11 × 5
##     cftd  nita  cacl  cans   pop
##    <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 -0.1  -0.09  1.56  0.67     0
##  2 -0.07 -0.09  1.45  0.26     0
##  3  0.06  0.02  1.01  0.4      0
##  4  0.37  0.11  1.99  0.38     0
##  5 -0.56 -0.31  1.51  0.16     0
##  6  0.12  0.11  1.14  0.17     0
##  7  0.07  0.02  1.31  0.25     0
##  8  0.15  0.05  1.88  0.27     0
##  9 -0.28 -0.27  1.27  0.51     0
## 10 -0.14 -0.14  1.42  0.43     0
## 11  0.01  0     2.15  0.7      0
samplepop2
## # A tibble: 15 × 5
##     cftd  nita  cacl  cans   pop
##    <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  0.54  0.11  2.33  0.48     1
##  2  0.29  0.06  1.84  0.38     1
##  3  0.15  0.05  2.17  0.55     1
##  4  0.14  0.07  2.61  0.52     1
##  5  0.31  0.05  4.45  0.69     1
##  6  0.56  0.11  4.29  0.44     1
##  7  0.47  0.14  2.92  0.45     1
##  8  0.32  0.07  4.24  0.63     1
##  9  0.17  0.07  1.8   0.52     1
## 10  0.2   0.08  1.99  0.3      1
## 11  0.12  0.05  2.52  0.69     1
## 12  0.15  0.06  2.23  0.56     1
## 13  0.14 -0.03  0.46  0.26     1
## 14  0.51  0.1   2.49  0.54     1
## 15  0.08  0.02  2.01  0.53     1
samplepop
## # A tibble: 26 × 5
##     cftd  nita  cacl  cans   pop
##    <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 -0.1  -0.09  1.56  0.67     0
##  2 -0.07 -0.09  1.45  0.26     0
##  3  0.06  0.02  1.01  0.4      0
##  4  0.37  0.11  1.99  0.38     0
##  5 -0.56 -0.31  1.51  0.16     0
##  6  0.12  0.11  1.14  0.17     0
##  7  0.07  0.02  1.31  0.25     0
##  8  0.15  0.05  1.88  0.27     0
##  9 -0.28 -0.27  1.27  0.51     0
## 10 -0.14 -0.14  1.42  0.43     0
## # … with 16 more rows
meanspop1 = colMeans(samplepop1)
meanspop2 = colMeans(samplepop2)
meanspop1
##        cftd        nita        cacl        cans         pop 
## -0.03363636 -0.05363636  1.51727273  0.38181818  0.00000000
meanspop2
##       cftd       nita       cacl       cans        pop 
## 0.27666667 0.06733333 2.55666667 0.50266667 1.00000000
meanssample = colMeans(samplepop)
meanssample
##       cftd       nita       cacl       cans        pop 
## 0.14538462 0.01615385 2.11692308 0.45153846 0.57692308
correctpop1vals = samplepop1 - meanssample
correctpop1vals
##           cftd        nita        cacl       cans         pop
## 1  -0.24538462 -0.10615385 -0.55692308  0.2184615 -0.57692308
## 2  -0.08615385 -2.20692308  0.99846154 -0.3169231 -0.14538462
## 3  -2.05692308 -0.43153846  0.43307692  0.2546154 -0.01615385
## 4  -0.08153846 -0.46692308  1.84461538  0.3638462 -2.11692308
## 5  -1.13692308 -0.45538462  1.49384615 -1.9569231 -0.45153846
## 6  -0.02538462  0.09384615 -0.97692308 -0.2815385 -0.57692308
## 7   0.05384615 -2.09692308  0.85846154 -0.3269231 -0.14538462
## 8  -1.96692308 -0.40153846  1.30307692  0.1246154 -0.01615385
## 9  -0.73153846 -0.84692308  1.12461538  0.4938462 -2.11692308
## 10 -0.71692308 -0.28538462  1.40384615 -1.6869231 -0.45153846
## 11 -0.13538462 -0.01615385  0.03307692  0.2484615 -0.57692308
correctpop2vals = samplepop2 - meanssample
correctpop2vals
##           cftd        nita        cacl        cans        pop
## 1   0.39461538 -0.03538462  2.18461538  0.33461538  0.8546154
## 2   0.27384615  0.04384615  1.82384615  0.36384615  0.9838462
## 3  -1.96692308 -2.06692308  0.05307692 -1.56692308 -1.1169231
## 4  -0.31153846 -0.38153846  2.15846154  0.06846154  0.5484615
## 5  -0.26692308 -0.52692308  3.87307692  0.11307692  0.4230769
## 6   0.41461538 -0.03538462  4.14461538  0.29461538  0.8546154
## 7   0.45384615  0.12384615  2.90384615  0.43384615  0.9838462
## 8  -1.79692308 -2.04692308  2.12307692 -1.48692308 -1.1169231
## 9  -0.28153846 -0.38153846  1.34846154  0.06846154  0.5484615
## 10 -0.37692308 -0.49692308  1.41307692 -0.27692308  0.4230769
## 11 -0.02538462 -0.09538462  2.37461538  0.54461538  0.8546154
## 12  0.13384615  0.04384615  2.21384615  0.54384615  0.9838462
## 13 -1.97692308 -2.14692308 -1.65692308 -1.85692308 -1.1169231
## 14  0.05846154 -0.35153846  2.03846154  0.08846154  0.5484615
## 15 -0.49692308 -0.55692308  1.43307692 -0.04692308  0.4230769
correctsamplevals = samplepop - meanssample
correctsamplevals
##           cftd         nita        cacl        cans         pop
## 1  -0.24538462 -0.106153846 -0.55692308  0.21846154 -0.57692308
## 2  -0.08615385 -2.206923077  0.99846154 -0.31692308 -0.14538462
## 3  -2.05692308 -0.431538462  0.43307692  0.25461538 -0.01615385
## 4  -0.08153846 -0.466923077  1.84461538  0.36384615 -2.11692308
## 5  -1.13692308 -0.455384615  1.49384615 -1.95692308 -0.45153846
## 6  -0.02538462  0.093846154 -0.97692308 -0.28153846 -0.57692308
## 7   0.05384615 -2.096923077  0.85846154 -0.32692308 -0.14538462
## 8  -1.96692308 -0.401538462  1.30307692  0.12461538 -0.01615385
## 9  -0.73153846 -0.846923077  1.12461538  0.49384615 -2.11692308
## 10 -0.71692308 -0.285384615  1.40384615 -1.68692308 -0.45153846
## 11 -0.13538462 -0.016153846  0.03307692  0.24846154 -0.57692308
## 12  0.52384615 -2.006923077  1.87846154 -0.09692308  0.85461538
## 13 -1.82692308 -0.391538462  1.26307692  0.23461538  0.98384615
## 14 -0.30153846 -0.526923077  2.02461538  0.53384615 -1.11692308
## 15 -0.43692308 -0.075384615  2.59384615 -1.59692308  0.54846154
## 16  0.16461538  0.033846154  2.33307692  0.23846154  0.42307692
## 17  0.54384615 -2.006923077  3.83846154 -0.13692308  0.85461538
## 18 -1.64692308 -0.311538462  2.34307692  0.30461538  0.98384615
## 19 -0.13153846 -0.506923077  4.09461538  0.61384615 -1.11692308
## 20 -0.40692308 -0.075384615  1.78384615 -1.59692308  0.54846154
## 21  0.05461538  0.063846154 -0.12692308 -0.15153846  0.42307692
## 22  0.10384615 -2.066923077  2.06846154  0.11307692  0.85461538
## 23 -1.96692308 -0.391538462  1.65307692  0.41461538  0.98384615
## 24 -0.31153846 -0.606923077  0.31461538  0.24384615 -1.11692308
## 25 -0.06692308 -0.045384615  2.47384615 -1.57692308  0.54846154
## 26 -0.06538462  0.003846154 -0.10692308  0.07846154  0.42307692
cov(correctpop1vals)
##             cftd         nita       cacl         cans         pop
## cftd  0.59170584 -0.151888913 -0.1856146  0.034700958 -0.17981017
## nita -0.15188891  0.612056105 -0.2417393  0.007512017 -0.09483719
## cacl -0.18561457 -0.241739295  0.7995777 -0.215452458 -0.18232386
## cans  0.03470096  0.007512017 -0.2154525  0.678152222 -0.20629253
## pop  -0.17981017 -0.094837187 -0.1823239 -0.206292528  0.57111655
cov(correctpop2vals)
##           cftd      nita      cacl      cans       pop
## cftd 0.7180692 0.6727220 0.8368467 0.6557682 0.6505520
## nita 0.6727220 0.6470101 0.7486209 0.6353453 0.6282953
## cacl 0.8368467 0.7486209 1.9505099 0.7943957 0.7103938
## cans 0.6557682 0.6353453 0.7943957 0.6397930 0.6177582
## pop  0.6505520 0.6282953 0.7103938 0.6177582 0.6112157
cov(correctsamplevals)
##             cftd        nita        cacl        cans         pop
## cftd  0.60950310 -0.18846646  0.08081515 -0.04387453 -0.08558987
## nita -0.18846646  0.58018789 -0.25565593 -0.07367497 -0.08177217
## cacl  0.08081515 -0.25565593  1.50603442 -0.09270030  0.20866370
## cans -0.04387453 -0.07367497 -0.09270030  0.60897230 -0.15525956
## pop  -0.08558987 -0.08177217  0.20866370 -0.15525956  0.82069033
solve(cov(correctpop1vals))
##          cftd     nita     cacl     cans      pop
## cftd 4.408537 2.887534 3.203368 1.841689 3.555356
## nita 2.887534 4.053395 3.106021 1.771759 3.213745
## cacl 3.203368 3.106021 4.436073 2.365448 3.794919
## cans 1.841689 1.771759 2.365448 2.929763 2.687453
## pop  3.555356 3.213745 3.794919 2.687453 5.586211
solve(cov(correctpop2vals))
##             cftd        nita       cacl       cans         pop
## cftd  105.514680  -275.20844  -6.454224   49.72244   127.84026
## nita -275.208442  2009.02319 -12.172110  -13.19278 -1744.76116
## cacl   -6.454224   -12.17211   2.386188  -12.29536    29.03544
## cans   49.722442   -13.19278 -12.295363  131.65972  -158.13956
## pop   127.840262 -1744.76116  29.035437 -158.13956  1785.16985
solve(cov(correctsamplevals))
##            cftd      nita       cacl      cans        pop
## cftd  1.9299985 0.7083343 -0.0104728 0.3079973  0.3327871
## nita  0.7083343 2.1922036  0.3204672 0.4400061  0.2940607
## cacl -0.0104728 0.3204672  0.7450532 0.1166224 -0.1365313
## cans  0.3079973 0.4400061  0.1166224 1.8356310  0.3935786
## pop   0.3327871 0.2940607 -0.1365313 0.3935786  1.3916639

5b)

confusionMatrix(as.matrix(solve(cov(correctpop1vals))))
## Confusion Matrix and Statistics
## 
##          cftd     nita     cacl     cans      pop
## cftd 4.408537 2.887534 3.203368 1.841689 3.555356
## nita 2.887534 4.053395 3.106021 1.771759 3.213745
## cacl 3.203368 3.106021 4.436073 2.365448 3.794919
## cans 1.841689 1.771759 2.365448 2.929763 2.687453
## pop  3.555356 3.213745 3.794919 2.687453 5.586211
## 
## Overall Statistics
##                                   
##                Accuracy : 0.2736  
##                  95% CI : (NA, NA)
##     No Information Rate : NA      
##     P-Value [Acc > NIR] : NA      
##                                   
##                   Kappa : 0.0867  
##                                   
##  Mcnemar's Test P-Value : 1       
## 
## Statistics by Class:
## 
##                      Class: cftd Class: nita Class: cacl Class: cans Class: pop
## Sensitivity              0.27733     0.26964     0.26240     0.25265    0.29654
## Specificity              0.81582     0.82638     0.79679     0.87002    0.77703
## Pos Pred Value           0.27733     0.26964     0.26240     0.25265    0.29654
## Neg Pred Value           0.81582     0.82638     0.79679     0.87002    0.77703
## Prevalence               0.20310     0.19206     0.21600     0.14816    0.24068
## Detection Rate           0.05633     0.05179     0.05668     0.03743    0.07137
## Detection Prevalence     0.20310     0.19206     0.21600     0.14816    0.24068
## Balanced Accuracy        0.54657     0.54801     0.52959     0.56133    0.53679